'''
词频分析主模块
'''
import asyncio
import collections
import time

from datastore import datastore
from model.models import modelHtmlText, modelWordCount
import statistics
from log import log
import settings

# Module-level logger shared by this file, built with the project's `log`
# helper using filename=settings.WORDCOUNT_LOG_FILE and level=log.INFO.
# NOTE: in this module the name `logging` refers to this object, not to the
# standard-library `logging` module.
logging = log(filename=settings.WORDCOUNT_LOG_FILE, level=log.INFO)

class wordCount:
    '''
    Word-frequency worker.

    Repeatedly takes a record via ``datastore.top(modelHtmlText)``, splits its
    text into whitespace-separated words, counts them, hands the
    (word, count) pairs to ``datastore.updateWordCount`` and finally calls the
    statistics handler registered for the record's site.

    Class attributes:
        SLEEP_TIME      seconds to wait before polling again when ``top``
                        returned nothing.
        CYCLE_GAP_TIME  seconds the async loop sleeps after each record.
        IGNORE_WORD     the single lowercase letters other than 'a' and 'i';
                        tokens equal to one of them are dropped before
                        counting.
    '''
    SLEEP_TIME = 2
    CYCLE_GAP_TIME = 0.1
    IGNORE_WORD = [chr(x) for x in range(ord('a'), ord('z') + 1) if chr(x) not in 'ai']

    def __init__(self):
        self.datastore = datastore()
        # Maps a record's ``site`` value to the handler that updates the
        # statistics for that site.
        self.STAT_FUNC = {'wikipedia' : self.__wikiStat}
        self.logging = logging

    def __wikiStat(self, wordList):
        '''
        Update the wikipedia statistics for one processed record.

        Adds 1 to the done-link counter, subtracts 1 from the
        pre-analyze-html counter and adds ``len(wordList)`` to the word
        counter.
        '''
        statistics.addwikiDoneLinkCount(1)
        statistics.decwikiPreAnalyzeHtmlCount(1)
        statistics.addwikiWordCount(len(wordList))

    @staticmethod
    def _tokenize(text):
        '''
        Split ``text`` on whitespace and drop every token listed in
        ``wordCount.IGNORE_WORD``.

        Returns the remaining tokens as a list, in their original order.
        '''
        # Build the lookup set once per call instead of scanning the list for
        # every token; reading IGNORE_WORD here keeps runtime changes to the
        # class attribute effective.
        ignored = frozenset(wordCount.IGNORE_WORD)
        return [word for word in text.split() if word not in ignored]

    def _processRecord(self, textObj):
        '''
        Count the words of one record and store the result.

        Shared by ``count`` and ``syncCount`` so both loops treat a record
        identically. ``textObj`` must expose ``site`` and ``text``.

        A site without an entry in ``self.STAT_FUNC`` is reported with a
        warning and its statistics step is skipped; the word counts have
        already been written at that point. Without this check the KeyError
        would end the whole processing loop.
        '''
        site = textObj.site
        text = textObj.text
        wordList = self._tokenize(text)
        # List of (word, frequency) tuples, most frequent first.
        wordTuples = collections.Counter(wordList).most_common()
        self.datastore.updateWordCount(wordTuples)
        statFunc = self.STAT_FUNC.get(site)
        if statFunc is None:
            self.logging.warning("wordCount no stat handler for site={}".format(site))
            return
        statFunc(wordList)

    async def count(self):
        '''
        Asynchronous entry point: count the word frequencies of every
        available record and write them through the datastore.

        Sleeps ``SLEEP_TIME`` seconds (yielding to the event loop) when no
        record is available and ``CYCLE_GAP_TIME`` seconds after each
        processed record. The loop only ends when an exception is raised
        inside it; that exception is logged and the coroutine returns.
        '''
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    await asyncio.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
                await asyncio.sleep(wordCount.CYCLE_GAP_TIME)
        except Exception as e:
            self.logging.error("wordCount count exception={}".format(e))

    def syncCount(self):
        '''
        Synchronous entry point: count the word frequencies of every
        available record and write them through the datastore.

        Logs a warning and blocks for ``SLEEP_TIME`` seconds when no record is
        available. The loop only ends when an exception is raised inside it;
        that exception is logged and the method returns.
        '''
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    self.logging.warning('no text')
                    time.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
        except Exception as e:
            self.logging.error("wordCount sync count exception={}".format(e))
            
            
def main():
    '''
    Script entry point: build a wordCount worker and drive its async
    ``count`` loop with ``asyncio.run``.

    A KeyboardInterrupt (Ctrl-C) is swallowed so the script exits quietly.
    '''
    try:
        worker = wordCount()
        asyncio.run(worker.count())
    except KeyboardInterrupt:
        return None

# Start the worker only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()